This notebook processes multiple CSV files containing time-series amplitude measurements and combines them into a single dataset. The code loops through all the CSVs in the same folder that contain 'time_' in the file names (that look like below left in Excel) and combines them into a file called raw_data.csv (that looks like below right in Excel). It also generates and saves to first/last reading images (one truncated and one untruncated).

image.png image-2.png

In [1]:
generate_plot = True
In [2]:
# imports
import pandas as pd
import os
import shutil
import glob
from re import findall, compile
import datetime
import ctypes
In [3]:
# define working directory
try:
    workingDirectory = os.path.dirname(__file__)
except:
    workingDirectory = str(os.getcwd())
In [4]:
# create list all the CSVs in the folder containing 'time_' in file name
os.chdir(workingDirectory)
list_of_csvs = glob.glob('*.{}'.format("csv"))
regex = compile(r'time_'); list_of_csvs = [i for i in list_of_csvs if regex.search(i)]
print("The following CSVs will be looped through...")
print(list_of_csvs)
The following CSVs will be looped through...
['time_0xZn_1_Ch1_240401_094500.csv', 'time_0xZn_1_Ch1_240402_161000.csv', 'time_0xZn_1_Ch1_240403_091000.csv', 'time_0xZn_1_Ch1_240404_093000.csv', 'time_0xZn_1_Ch1_240405_093000.csv', 'time_0xZn_1_Ch1_240408_094500.csv', 'time_0xZn_1_Ch1_240517_131500.csv']
In [5]:
# define list to store width (number of columns) for each dataframe (these should always be equal)
df_widths = []
In [6]:
# create distances dataframe (same for every file in folder)
df_distances = pd.read_csv(list_of_csvs[0], header=None, names=["Distance","Amplitude"]).T
df_distances = df_distances[0:1]
df_distances = df_distances.reset_index()

df_distances_width = df_distances.shape[1]
df_widths.append(df_distances_width)
In [7]:
# loop through the files, transpose, and add single rows to the df_all_amplitures dataframe
df_all_amplitudes =  pd.DataFrame()

num_csvs_looped_though = 0

for current_csv_file in list_of_csvs:

    print("Processing: " + current_csv_file + " (" + str(num_csvs_looped_though+1) + " of " + str(len(list_of_csvs)) + ")")

    # get timestemp
    try:
        datetime_from_file_name = findall(r'_([0-9]{6}_[0-9]{5,})\.csv', current_csv_file)[0]
    except:
        ctypes.windll.user32.MessageBoxW(0, "The are CSV files (excluding raw_data.csv) that do not match the regular expression below:\n\n_([0-9]{6}_[0-9]{5,})\.csv", "Error", 1)

    year = datetime_from_file_name[0:2]
    month = datetime_from_file_name[2:4]
    day = datetime_from_file_name[4:6]
    time = findall(r'_([0-9]{5,})\.csv', current_csv_file)[0]
    hour = time[0:2]
    minute = time[2:4]
    full_timestamp = "20" + year + "-" + month + "-" + day + "T" + hour + ":" + minute + ":00+00:00"
    
    # read and transpose CSV
    df_amplitude = pd.read_csv(current_csv_file, header=None, names=["Distance","Amplitude"]).T
    df_amplitude = df_amplitude[1:2]
    df_amplitude = df_amplitude.reset_index()

    df_amplitude_width = df_amplitude.shape[1]
    df_widths.append(df_distances_width)

    # replace first (leftmost) value with timestamp
    df_amplitude['index'] = df_amplitude['index'].replace(['Amplitude'],full_timestamp)
    df_all_amplitudes = pd.concat([df_all_amplitudes, df_amplitude], ignore_index=True)
    
    # add to counter
    num_csvs_looped_though = num_csvs_looped_though + 1
Processing: time_0xZn_1_Ch1_240401_094500.csv (1 of 7)
Processing: time_0xZn_1_Ch1_240402_161000.csv (2 of 7)
Processing: time_0xZn_1_Ch1_240403_091000.csv (3 of 7)
Processing: time_0xZn_1_Ch1_240404_093000.csv (4 of 7)
Processing: time_0xZn_1_Ch1_240405_093000.csv (5 of 7)
Processing: time_0xZn_1_Ch1_240408_094500.csv (6 of 7)
Processing: time_0xZn_1_Ch1_240517_131500.csv (7 of 7)
In [8]:
# sort dataframe (so earliest readings are at top)
df_all_amplitudes = df_all_amplitudes.sort_values(['index'], ascending = [True]) 
In [9]:
# union distances (top row only) and all amplitudes dataframes
df_final = pd.concat([df_distances, df_all_amplitudes], ignore_index=True)
df_final['index'] = df_final['index'].replace(['Distance'],'')
df_final.head()
Out[9]:
index 0 1 2 3 4 5 6 7 8 ... 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
0 0.000000 0.107207 0.214414 0.321622 0.428829 0.536036 0.643243 0.750450 0.857658 ... 127.576577 127.683784 127.790991 127.898198 128.005405 128.112613 128.219820 128.327027 128.434234 128.541441
1 2024-04-01T09:45:00+00:00 0.069553 0.036079 0.016572 0.010159 0.018783 0.014729 0.012764 0.014193 0.021974 ... 0.001055 0.002097 0.001326 0.001961 0.006971 0.011156 0.009833 0.008984 0.014437 0.031000
2 2024-04-02T16:10:00+00:00 0.063265 0.036869 0.016237 0.010420 0.018407 0.014863 0.012432 0.013837 0.021768 ... 0.000661 0.001665 0.000946 0.002118 0.006235 0.009632 0.009815 0.008587 0.015554 0.030573
3 2024-04-03T09:10:00+00:00 0.048215 0.032925 0.016320 0.010914 0.016880 0.014081 0.012564 0.013948 0.022045 ... 0.000173 0.001283 0.000691 0.001887 0.005185 0.006122 0.009726 0.010025 0.015649 0.032935
4 2024-04-04T09:30:00+00:00 0.070546 0.037625 0.017032 0.009807 0.019065 0.014699 0.012599 0.013919 0.021953 ... 0.000979 0.001985 0.001038 0.001870 0.006644 0.011179 0.009658 0.008618 0.014939 0.030755

5 rows × 1201 columns

In [10]:
# check dataframe widths are equal
df_widths = list(set(df_widths))
if len(df_widths) == 1:
    print("Dataframe widths are equal, no error")
else:
    ctypes.windll.user32.MessageBoxW(0, "Dataframe widths are unequal, all CSV files should have the same number of rows.", "Error", 1)
Dataframe widths are equal, no error
In [ ]:
# output CSV
output_file_name = "raw_data.csv"

try:
    os.remove(output_file_name)
except:
    pass

df_final.to_csv(output_file_name, index=False, header=False)
In [12]:
# generate and save plots
if generate_plot == True:
    import matplotlib.pyplot as plt

    def make_plot(truncate_dist):

        df_final_t = df_final.T.reset_index()
        df_final_t.columns = df_final_t.iloc[0]
        df_final_t = df_final_t[1:]
        df_final_t = df_final_t.rename(columns={df_final_t.columns[1]: 'Distance'})
        df_final_t = df_final_t.drop(['index'], axis=1)
        df_final_t = df_final_t.iloc[:, [0, 1, -1]]  # get first, second, and last columns
        
        if truncate_dist == True:
            max_index_first = pd.to_numeric(df_final_t.iloc[:, 1], errors='coerce')
            max_index_first = max_index_first.idxmax()

            max_index_last = pd.to_numeric(df_final_t.iloc[:, 2], errors='coerce')
            max_index_last = max_index_last.idxmax()

            maxs_added = max_index_first + max_index_last
            df_final_t = df_final_t.iloc[:maxs_added, :]

            
        formatted_time_first = df_final_t.columns[1]
        formatted_time_first= formatted_time_first.replace('T',' ') 
        formatted_time_first = formatted_time_first.replace(':00+00:00','') 

        formatted_time_last = df_final_t.columns[2]
        formatted_time_last= formatted_time_last.replace('T',' ') 
        formatted_time_last = formatted_time_last.replace(':00+00:00','') 

        plt.figure(figsize=(14, 8))

        # Plot first recent reading
        plt.scatter(df_final_t['Distance'], df_final_t.iloc[:, 1], label='First reading')
        plt.plot(df_final_t['Distance'], df_final_t.iloc[:, 1], linestyle='-', linewidth=0.5, color='blue', alpha=0.5)

        # Plot most recent reading
        plt.scatter(df_final_t['Distance'], df_final_t.iloc[:, 2], label='Most recent reading')
        plt.plot(df_final_t['Distance'], df_final_t.iloc[:, 2], linestyle='-', linewidth=0.5, color='red', alpha=0.5)


        plt.xlabel(df_final_t.columns[0])  # Label for x-axis (first column name)
        plt.ylabel("Amplitude")  # Label for y-axis (second column name)
        plt.ylim(bottom=0)

        chart_title = "Amplitudes at " + formatted_time_first + " (first reading) and " + formatted_time_last + " (most recent reading)"
        if truncate_dist == True:
            chart_title = chart_title + " [DISTANCE TRUNCATED]"

        plt.title(chart_title)
        plt.legend()

        image_title = "Most recent and first readings"
        if truncate_dist == True:
            image_title = image_title + " (distance truncated)"
        image_title = image_title + ".png"
        
        try:
            os.remove(image_title)
        except:
            pass
        plt.savefig(image_title, format='png', dpi=300)
        #plt.show()

    make_plot(truncate_dist=False)
    try:
        make_plot(truncate_dist=True)  
    except:
        pass
No description has been provided for this image
No description has been provided for this image